\(~\)

Loading libraries

# For Problem 3
library(MASS)
library(matrixcalc)
library(GGally)

# For all Problems
library(ggplot2)
library(tidyverse)
library(hrbrthemes)

Problem 3 Kaggle Competition

You are to compete in the House Prices: Advanced Regression Techniques competition https://www.kaggle.com/c/house-prices-advanced-regression-techniques. I want you to do the following:

\(~\)

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points
# Read csv file renaming train.csv to house_train
house_train <- read.csv('/Users/letiix3/Documents/Data Sciene MS Program/Data-605/Week-15/Submission_Prediction/train.csv', header = T, sep = ",")
head(house_train[1:15], n = 5)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm
# glimpse of data set columns 1 - 10
glimpse(house_train[1:10])
## Rows: 1,460
## Columns: 10
## $ Id          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ MSSubClass  <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20, 2…
## $ MSZoning    <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "RL"…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, NA…
## $ LotArea     <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6120,…
## $ Street      <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "P…
## $ Alley       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ LotShape    <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", "R…
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "L…
## $ Utilities   <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"…
# summary of data set with filter of numeric values only
house_train %>%
  select_if(is.numeric) %>%
  summary()
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF       X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 

\(~\)

# plot to see overview of SalePrice
house_train%>%
  ggplot( aes(x = SalePrice)) +
    geom_histogram( binwidth = 30,  fill = "pink", color = "#e9ecef", alpha = 0.9) +
    ggtitle("Histogram of 'SalePrice'") +
    theme_ipsum() +
    theme(
      plot.title = element_text(size = 15)
    )

# selecting random columns to create a Correlogram
house_train%>%
ggpairs(columns = 44:53, ggplot2::aes(colour = 'species')) 

\(~\)

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.
colnames(house_train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
# plot for SalePrice and LotArea
house_train %>%
ggplot(aes(x = SalePrice, y = LotArea, color = YearBuilt)) + 
    geom_point(size = 3) +
    geom_smooth(method = "lm", color = 'red') +
    theme_ipsum()
## `geom_smooth()` using formula 'y ~ x'

# plot for YearBuilt and SalePrice
house_train %>%
ggplot(aes(x = YearBuilt, y = SalePrice, color = YrSold)) + 
    geom_point(size = 3) +
    geom_smooth(method = "lm", color = 'red') +
    theme_ipsum()
## `geom_smooth()` using formula 'y ~ x'

\(~\)

Derive a correlation matrix for any three quantitative variables in the dataset.
cor_house <- house_train %>%
  dplyr::select(LotArea, YearBuilt, X1stFlrSF)
cor_m <- cor(cor_house)
cor_m
##              LotArea  YearBuilt X1stFlrSF
## LotArea   1.00000000 0.01422765 0.2994746
## YearBuilt 0.01422765 1.00000000 0.2819859
## X1stFlrSF 0.29947458 0.28198586 1.0000000

\(~\)

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Based on the tests below, we can reject the null hypothesis that states that the correlation is zero and accept the alternative hypothesis. We should be worried about familywise error.

cor.test(house_train$LotArea, house_train$YearBuilt, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train$LotArea and house_train$YearBuilt
## t = 0.54332, df = 1458, p-value = 0.587
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.01934322  0.04776648
## sample estimates:
##        cor 
## 0.01422765
cor.test(house_train$LotArea, house_train$X1stFlrSF, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train$LotArea and house_train$X1stFlrSF
## t = 11.985, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2686127 0.3297222
## sample estimates:
##       cor 
## 0.2994746
cor.test(house_train$X1stFlrSF, house_train$YearBuilt, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train$X1stFlrSF and house_train$YearBuilt
## t = 11.223, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2507977 0.3125892
## sample estimates:
##       cor 
## 0.2819859

\(~\)

Linear Algebra and Correlation

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points
# invert correlation matrix from above
invert_cor <- solve(cor_m)
invert_cor
##              LotArea  YearBuilt  X1stFlrSF
## LotArea    1.1050234  0.0842977 -0.3546972
## YearBuilt  0.0842977  1.0928157 -0.3334036
## X1stFlrSF -0.3546972 -0.3334036  1.2002379
# precision
precision_cor <- round(cor_m %*% invert_cor)
precision_cor
##           LotArea YearBuilt X1stFlrSF
## LotArea         1         0         0
## YearBuilt       0         1         0
## X1stFlrSF       0         0         1
# LU decomposition
lu_decom_cor <- lu.decomposition(cor_m)
lu_decom_cor
## $L
##            [,1]      [,2] [,3]
## [1,] 1.00000000 0.0000000    0
## [2,] 0.01422765 1.0000000    0
## [3,] 0.29947458 0.2777813    1
## 
## $U
##      [,1]       [,2]      [,3]
## [1,]    1 0.01422765 0.2994746
## [2,]    0 0.99979757 0.2777250
## [3,]    0 0.00000000 0.8331682

\(~\)

Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/Rdevel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points
# Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.
house_train%>%
  filter(OpenPorchSF < 350) %>%
  ggplot( aes(x = OpenPorchSF)) +
    geom_histogram( binwidth = 15,  fill = "purple", color = "#e9ecef", alpha = 0.9) +
    ggtitle("Histogram of 'OpenPorchSF'") +
    theme_ipsum() +
    theme(
      plot.title = element_text(size=15)
    )

# exponential distribution
expon_OpenPorchSF <- fitdistr(house_train$OpenPorchSF, 'exponential')
# lambda
lamb_OpenPorchSF <- expon_OpenPorchSF$estimate
lamb_OpenPorchSF
##       rate 
## 0.02143151
# sample of 1000
exp_samp <- rexp(1000, lamb_OpenPorchSF)
summary(exp_samp)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0764  13.9497  34.3816  49.2193  69.8457 375.4303
# Plot histogram and compare it with original histogram
hist(exp_samp, main = "Histogram of Exponential Sample of 'OpenPorchSF'")

\(~\)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# 5th and 95th percentiles
lower <- qexp(0.05, lamb_OpenPorchSF)
lower
## [1] 2.393359
upper <- qexp(0.95, lamb_OpenPorchSF)
upper
## [1] 139.7817
# empirical 5th and 95th percentile
quantile(house_train$OpenPorchSF, c(0.05, 0.95))
##     5%    95% 
##   0.00 175.05
print(paste('The 5th percentile is 0.00 and the 95th percentile is 175.05.'))
## [1] "The 5th percentile is 0.00 and the 95th percentile is 175.05."

\(~\)

Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com username and score. 10 points
# selection columns that are numeric only
house_train2 <- house_train %>% 
  dplyr::select_if(is.numeric)
# Check for missing values in data 
colSums(is.na(house_train2))
##            Id    MSSubClass   LotFrontage       LotArea   OverallQual 
##             0             0           259             0             0 
##   OverallCond     YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1 
##             0             0             0             8             0 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF     X1stFlrSF     X2ndFlrSF 
##             0             0             0             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd    Fireplaces 
##             0             0             0             0             0 
##   GarageYrBlt    GarageCars    GarageArea    WoodDeckSF   OpenPorchSF 
##            81             0             0             0             0 
## EnclosedPorch    X3SsnPorch   ScreenPorch      PoolArea       MiscVal 
##             0             0             0             0             0 
##        MoSold        YrSold     SalePrice 
##             0             0             0
model_house <- lm(SalePrice ~., house_train2)
summary(model_house)
## 
## Call:
## lm(formula = SalePrice ~ ., data = house_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442182  -16955   -2824   15125  318183 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.351e+05  1.701e+06  -0.197 0.843909    
## Id            -1.205e+00  2.658e+00  -0.453 0.650332    
## MSSubClass    -2.001e+02  3.451e+01  -5.797 8.84e-09 ***
## LotFrontage   -1.160e+02  6.126e+01  -1.894 0.058503 .  
## LotArea        5.422e-01  1.575e-01   3.442 0.000599 ***
## OverallQual    1.866e+04  1.482e+03  12.592  < 2e-16 ***
## OverallCond    5.239e+03  1.368e+03   3.830 0.000135 ***
## YearBuilt      3.164e+02  8.766e+01   3.610 0.000321 ***
## YearRemodAdd   1.194e+02  8.668e+01   1.378 0.168607    
## MasVnrArea     3.141e+01  7.022e+00   4.473 8.54e-06 ***
## BsmtFinSF1     1.736e+01  5.838e+00   2.973 0.003014 ** 
## BsmtFinSF2     8.342e+00  8.766e+00   0.952 0.341532    
## BsmtUnfSF      5.005e+00  5.277e+00   0.948 0.343173    
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.597e+01  7.360e+00   6.246 6.02e-10 ***
## X2ndFlrSF      4.663e+01  6.102e+00   7.641 4.72e-14 ***
## LowQualFinSF   3.341e+01  2.794e+01   1.196 0.232009    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   9.043e+03  3.198e+03   2.828 0.004776 ** 
## BsmtHalfBath   2.465e+03  5.073e+03   0.486 0.627135    
## FullBath       5.433e+03  3.531e+03   1.539 0.124182    
## HalfBath      -1.098e+03  3.321e+03  -0.331 0.740945    
## BedroomAbvGr  -1.022e+04  2.155e+03  -4.742 2.40e-06 ***
## KitchenAbvGr  -2.202e+04  6.710e+03  -3.282 0.001063 ** 
## TotRmsAbvGrd   5.464e+03  1.487e+03   3.674 0.000251 ***
## Fireplaces     4.372e+03  2.189e+03   1.998 0.046020 *  
## GarageYrBlt   -4.728e+01  9.106e+01  -0.519 0.603742    
## GarageCars     1.685e+04  3.491e+03   4.827 1.58e-06 ***
## GarageArea     6.274e+00  1.213e+01   0.517 0.605002    
## WoodDeckSF     2.144e+01  1.002e+01   2.139 0.032662 *  
## OpenPorchSF   -2.252e+00  1.949e+01  -0.116 0.907998    
## EnclosedPorch  7.295e+00  2.062e+01   0.354 0.723590    
## X3SsnPorch     3.349e+01  3.758e+01   0.891 0.373163    
## ScreenPorch    5.805e+01  2.041e+01   2.844 0.004532 ** 
## PoolArea      -6.052e+01  2.990e+01  -2.024 0.043204 *  
## MiscVal       -3.761e+00  6.960e+00  -0.540 0.589016    
## MoSold        -2.217e+02  4.229e+02  -0.524 0.600188    
## YrSold        -2.474e+02  8.458e+02  -0.293 0.769917    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36800 on 1085 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.8096, Adjusted R-squared:  0.8034 
## F-statistic: 131.8 on 35 and 1085 DF,  p-value: < 2.2e-16
plot(model_house)

model_house2 <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea +  OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold + YrSold, house_train2)
summary(model_house2)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + 
##     OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + 
##     BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + 
##     X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + 
##     FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + 
##     Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + 
##     OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + 
##     PoolArea + MiscVal + MoSold + YrSold, data = house_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442865  -16873   -2581   14998  318042 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.232e+05  1.701e+06  -0.190 0.849317    
## MSSubClass    -2.005e+02  3.449e+01  -5.814 8.03e-09 ***
## LotFrontage   -1.161e+02  6.124e+01  -1.896 0.058203 .  
## LotArea        5.454e-01  1.573e-01   3.466 0.000548 ***
## OverallQual    1.870e+04  1.478e+03  12.646  < 2e-16 ***
## OverallCond    5.227e+03  1.367e+03   3.824 0.000139 ***
## YearBuilt      3.170e+02  8.762e+01   3.617 0.000311 ***
## YearRemodAdd   1.206e+02  8.661e+01   1.392 0.164174    
## MasVnrArea     3.160e+01  7.006e+00   4.511 7.15e-06 ***
## BsmtFinSF1     1.739e+01  5.835e+00   2.980 0.002947 ** 
## BsmtFinSF2     8.362e+00  8.763e+00   0.954 0.340205    
## BsmtUnfSF      5.006e+00  5.275e+00   0.949 0.342890    
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.591e+01  7.356e+00   6.241 6.21e-10 ***
## X2ndFlrSF      4.668e+01  6.099e+00   7.654 4.28e-14 ***
## LowQualFinSF   3.415e+01  2.788e+01   1.225 0.220788    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   8.980e+03  3.194e+03   2.812 0.005018 ** 
## BsmtHalfBath   2.490e+03  5.071e+03   0.491 0.623487    
## FullBath       5.390e+03  3.529e+03   1.527 0.126941    
## HalfBath      -1.119e+03  3.320e+03  -0.337 0.736244    
## BedroomAbvGr  -1.023e+04  2.154e+03  -4.750 2.30e-06 ***
## KitchenAbvGr  -2.193e+04  6.704e+03  -3.271 0.001105 ** 
## TotRmsAbvGrd   5.440e+03  1.486e+03   3.661 0.000263 ***
## Fireplaces     4.375e+03  2.188e+03   2.000 0.045793 *  
## GarageYrBlt   -4.914e+01  9.093e+01  -0.540 0.589011    
## GarageCars     1.679e+04  3.487e+03   4.815 1.68e-06 ***
## GarageArea     6.488e+00  1.211e+01   0.536 0.592338    
## WoodDeckSF     2.155e+01  1.002e+01   2.151 0.031713 *  
## OpenPorchSF   -2.315e+00  1.948e+01  -0.119 0.905404    
## EnclosedPorch  7.233e+00  2.061e+01   0.351 0.725733    
## X3SsnPorch     3.458e+01  3.749e+01   0.922 0.356593    
## ScreenPorch    5.797e+01  2.040e+01   2.842 0.004572 ** 
## PoolArea      -6.126e+01  2.984e+01  -2.053 0.040326 *  
## MiscVal       -3.850e+00  6.955e+00  -0.554 0.579980    
## MoSold        -2.240e+02  4.227e+02  -0.530 0.596213    
## YrSold        -2.536e+02  8.454e+02  -0.300 0.764216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36790 on 1086 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.8095, Adjusted R-squared:  0.8036 
## F-statistic: 135.7 on 34 and 1086 DF,  p-value: < 2.2e-16
plot(model_house2)

# Using only significant columns
model_house3 <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea +  OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + ScreenPorch + PoolArea, house_train2)
summary(model_house3)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + 
##     OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + 
##     X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + 
##     TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + ScreenPorch + 
##     PoolArea, data = house_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -451835  -17454   -2047   14390  317137 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.454e+05  1.003e+05  -8.427  < 2e-16 ***
## MSSubClass   -1.821e+02  3.184e+01  -5.719 1.36e-08 ***
## LotFrontage  -9.469e+01  5.782e+01  -1.638 0.101765    
## LotArea       5.560e-01  1.544e-01   3.602 0.000329 ***
## OverallQual   1.897e+04  1.323e+03  14.339  < 2e-16 ***
## OverallCond   5.179e+03  1.105e+03   4.687 3.10e-06 ***
## YearBuilt     3.958e+02  5.090e+01   7.775 1.64e-14 ***
## MasVnrArea    3.239e+01  6.789e+00   4.770 2.07e-06 ***
## BsmtFinSF1    1.037e+01  3.495e+00   2.967 0.003067 ** 
## X1stFlrSF     5.745e+01  5.596e+00  10.266  < 2e-16 ***
## X2ndFlrSF     4.995e+01  4.845e+00  10.309  < 2e-16 ***
## BsmtFullBath  9.782e+03  2.779e+03   3.521 0.000447 ***
## BedroomAbvGr -1.025e+04  1.902e+03  -5.388 8.61e-08 ***
## KitchenAbvGr -1.324e+04  5.656e+03  -2.340 0.019451 *  
## TotRmsAbvGrd  5.553e+03  1.398e+03   3.971 7.58e-05 ***
## Fireplaces    3.310e+03  2.070e+03   1.599 0.110111    
## GarageCars    1.101e+04  1.916e+03   5.747 1.16e-08 ***
## WoodDeckSF    2.416e+01  9.638e+00   2.507 0.012305 *  
## ScreenPorch   5.425e+01  1.969e+01   2.755 0.005956 ** 
## PoolArea     -5.982e+01  2.883e+01  -2.075 0.038214 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36680 on 1175 degrees of freedom
##   (265 observations deleted due to missingness)
## Multiple R-squared:  0.8086, Adjusted R-squared:  0.8055 
## F-statistic: 261.3 on 19 and 1175 DF,  p-value: < 2.2e-16
plot(model_house3)

# Removing 2 more columns 
model_house4 <- lm(SalePrice ~ MSSubClass + LotArea +  OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + GarageCars + WoodDeckSF + ScreenPorch + PoolArea, house_train2)
summary(model_house4)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF + 
##     X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + 
##     TotRmsAbvGrd + GarageCars + WoodDeckSF + ScreenPorch + PoolArea, 
##     data = house_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -474172  -16306   -2075   13986  299145 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.297e+05  8.859e+04  -9.366  < 2e-16 ***
## MSSubClass   -1.623e+02  2.582e+01  -6.284 4.37e-10 ***
## LotArea       4.401e-01  9.932e-02   4.431 1.01e-05 ***
## OverallQual   1.884e+04  1.121e+03  16.813  < 2e-16 ***
## OverallCond   5.066e+03  9.229e+02   5.489 4.77e-08 ***
## YearBuilt     3.868e+02  4.497e+01   8.602  < 2e-16 ***
## MasVnrArea    2.960e+01  5.876e+00   5.037 5.34e-07 ***
## BsmtFinSF1    1.020e+01  2.973e+00   3.430 0.000621 ***
## X1stFlrSF     5.910e+01  4.551e+00  12.985  < 2e-16 ***
## X2ndFlrSF     4.969e+01  4.071e+00  12.206  < 2e-16 ***
## BsmtFullBath  8.982e+03  2.374e+03   3.784 0.000161 ***
## BedroomAbvGr -1.046e+04  1.641e+03  -6.371 2.52e-10 ***
## KitchenAbvGr -1.441e+04  5.050e+03  -2.854 0.004373 ** 
## TotRmsAbvGrd  5.468e+03  1.214e+03   4.503 7.25e-06 ***
## GarageCars    1.045e+04  1.695e+03   6.165 9.13e-10 ***
## WoodDeckSF    2.516e+01  7.905e+00   3.182 0.001492 ** 
## ScreenPorch   5.435e+01  1.673e+01   3.249 0.001185 ** 
## PoolArea     -3.060e+01  2.342e+01  -1.307 0.191504    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34830 on 1434 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.8092, Adjusted R-squared:  0.807 
## F-statistic: 357.9 on 17 and 1434 DF,  p-value: < 2.2e-16
# Using model_house4 we'll create a scatterplot
plot(model_house4)

\(~\)

# Predict prices for test data
house_test <- read.csv('/Users/letiix3/Documents/Data Sciene MS Program/Data-605/Week-15/Submission_Prediction/test.csv')
house_test2 <- house_test %>%
  dplyr::select_if(is.numeric) %>%
  replace(is.na(.),0)
prediction <- predict(model_house4, house_test2, type = "response")
head(prediction)
##        1        2        3        4        5        6 
## 125568.8 172694.7 169683.1 198876.9 198717.8 181050.2
# Prepare data frame for submission
kaggle_prediction <- data.frame(Id = house_test2$Id, SalePrice = prediction)
head(kaggle_prediction)
##     Id SalePrice
## 1 1461  125568.8
## 2 1462  172694.7
## 3 1463  169683.1
## 4 1464  198876.9
## 5 1465  198717.8
## 6 1466  181050.2
# commenting out so it doesn't reproduce
#write.csv(kaggle_prediction, file = "submission_prediction.csv", row.names=FALSE)

\(~\)

Submission

Username: letisalba

Score: 0.34307

\(~\)

Reference for Problem 2